package crawler;
import cn.edu.hfut.dmic.webcollector.crawler.DeepCrawler;
import cn.edu.hfut.dmic.webcollector.crawler.OnCrawlerVisitListener;
import cn.edu.hfut.dmic.webcollector.model.Links;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.net.HttpRequester;
import crawler.classes.Article;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.*;

/**
 * Crawler for Sina blog search results ({@code search.sina.com.cn}).
 *
 * <p>Seeds a keyword search, extracts article metadata from each result page,
 * fetches the full article body via Jsoup, persists batches through
 * {@code ArticlesService}, and follows pagination links matching
 * {@link #next_link_regex}.
 */
public class CrawlerBlog extends DeepCrawler {
	/** Pause between page visits, in milliseconds (throttles requests). */
	public int wait_time = 1000;

	@Override
	public HttpRequester getHttpRequester() {
		// Pass-through kept for compatibility; no custom requester is configured here.
		return super.getHttpRequester();
	}

	/** Source-type tag stored with every harvested article. */
	public String type = "blog";
	/** Search keyword driving the seed URL and stored with each article. */
	public String key_word;
	/** Regex a pagination href must match to be followed. */
	public String next_link_regex = ".*page=\\d+.*";
	/** Pagination URLs already queued, to avoid re-adding them. */
	public ArrayList<String> current_urls = new ArrayList<String>();
	/** Article URLs already harvested, to avoid duplicate saves. */
	public ArrayList<String> current_articles = new ArrayList<String>();
	/** Whether this run reports updated-article counts to the listener. */
	public boolean isUpdate;

	/**
	 * Creates a crawler seeded with a Sina blog search for {@code key_word}.
	 *
	 * @param crawlPath directory used by {@code DeepCrawler} for its crawl state
	 * @param key_word  search keyword; persisted via {@code ArticlesService}
	 * @param isUpdate  true to report updated-article counts via the listener
	 */
	public CrawlerBlog(String crawlPath, String key_word, boolean isUpdate) {
		super(crawlPath);
		this.isUpdate = isUpdate;
		ArticlesService.SaveKeyWord(key_word);
		this.key_word = key_word;
		// FIX: use the charset-aware overload; the no-arg URLEncoder.encode(String)
		// is deprecated and encodes with the platform-default charset, which makes
		// the seed URL vary between JVMs.
		this.addSeed("http://search.sina.com.cn/?q=" + encodeUtf8(key_word) + "&c=blog");
	}

	/** URL-encodes {@code s} as UTF-8. UTF-8 is mandatory on every JVM, so the catch is unreachable. */
	private static String encodeUtf8(String s) {
		try {
			return URLEncoder.encode(s, "UTF-8");
		} catch (UnsupportedEncodingException e) {
			throw new IllegalStateException("UTF-8 not supported", e);
		}
	}

	/**
	 * Visits one result page: extracts its articles, throttles, then returns
	 * the pagination links to crawl next.
	 */
	@Override
	public Links visitAndGetNextLinks(Page page) {
		extractArticles(page);
		try {
			System.out.println("暂停时间:" + wait_time / 1000 + " 秒");
			Thread.sleep(this.wait_time);
		} catch (InterruptedException e) {
			// FIX: restore the interrupt status instead of swallowing it, so the
			// framework can observe a requested shutdown.
			Thread.currentThread().interrupt();
		}
		return getPageLinks(page);
	}

	/** Collects not-yet-queued pagination links (".pagebox" anchors matching {@link #next_link_regex}). */
	private Links getPageLinks(Page page) {
		Links ls = new Links();
		Elements elements = page.getDoc().select(".pagebox").select("a");
		for (Element a : elements) {
			String href = a.attr("href");
			if (href.matches(this.next_link_regex)) {
				// Pagination hrefs are relative; prefix the search host.
				String url = "http://search.sina.com.cn" + href;
				if (!this.current_urls.contains(url)) {
					ls.add(url);
					System.out.println("添加下次采集链接:" + href);
					this.current_urls.add(url);
				}
			}
		}
		return ls;
	}

	/**
	 * Extracts every article from a result page ({@code .box-result} blocks),
	 * fetches the full article body per URL (best-effort), and saves the batch
	 * through {@code ArticlesService}.
	 */
	public void extractArticles(Page page) {
		Document doc = page.getDoc();
		List<Article> articleList = new ArrayList<Article>();

		Elements results = doc.select(".box-result");
		for (Element element : results) {
			// Hoist the repeated selects; Jsoup select() never returns null.
			Elements titleEls = element.select(".r-info-blog-tit");
			Elements contentEls = element.select(".content");
			// FIX: a result block without the expected title markup previously threw
			// IndexOutOfBoundsException from get(0) and aborted the whole extraction.
			if (titleEls.isEmpty()) {
				continue;
			}
			String url = titleEls.first().getElementsByTag("a").attr("href");
			String title = titleEls.first().text();
			String content = contentEls.text();
			String src_html = contentEls.html();
			String create_time = element.select(".fgray_time").text();

			// FIX: Jsoup attr()/text()/html() never return null, so the old null
			// checks were dead; the real junk case is an empty href.
			if (!url.isEmpty() && !this.current_articles.contains(url)) {
				String full_content = "";
				String full_html = "";
				try {
					// Best-effort fetch of the full article; failure leaves the
					// full_* fields empty rather than dropping the article.
					Document fulldoc = Jsoup.connect(url).get();
					full_content = fulldoc.text();
					full_html = fulldoc.html();
				} catch (IOException e) {
					e.printStackTrace();
				}

				Article article = new Article(this.type, this.key_word, url, title, content,
						src_html, create_time, full_content, full_html);
				articleList.add(article);
				current_articles.add(url);
				if (onCrawlerVisitListener != null) {
					onCrawlerVisitListener.onVisit(url);
				}
				this.totalSize = current_articles.size();
			}
		}

		int updateSize = ArticlesService.saveMore(articleList, isUpdate);
		if (isUpdate) {
			if (onCrawlerVisitListener != null) {
				onCrawlerVisitListener.onUpdate(updateSize);
			}
		}
		System.out.println("break");
	}

	/** Ad-hoc entry point: crawls the keyword "仕邦" with 2 threads, 20 levels deep. */
	public static void main(String[] args) throws Exception {
		CrawlerBlog crawler = new CrawlerBlog("./tmp", "仕邦", false);
		crawler.next_link_regex = ".*page=\\d+.*";
		crawler.current_urls = new ArrayList<String>();
		crawler.setThreads(2);
		crawler.setOnCrawlerVisitListener(new OnCrawlerVisitListener() {

			@Override
			public int onVisit(String url) {
				return 0;
			}

			@Override
			public int onComplete(int count) {
				return 0;
			}

			@Override
			public int onUpdate(int updateSize) {
				return 0;
			}
		});
		crawler.start(20);
	}
}
